{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "67c1e6b1",
   "metadata": {},
   "source": [
    "\n",
    "\n",
    "下面的例子将使用常用的词向量工具包gensim加载预训练的GloVe词嵌入，并展示词嵌入如何表示词的相似度。\n",
    "<!-- https://nlp.stanford.edu/projects/glove/ -->"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "5c5a740a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pprint\n",
    "\n",
    "from gensim.models import KeyedVectors\n",
    "\n",
    "# Download the GloVe vectors from the official GloVe site (glove.6B.zip is used here),\n",
    "# unzip the archive, and change the path below to point at the extracted file.\n",
    "# no_header=True tells gensim not to expect a word2vec-style header line in the file.\n",
    "model = KeyedVectors.load_word2vec_format(\n",
    "    './data/glove.6B.100d.txt',\n",
    "    binary=False,\n",
    "    no_header=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "01a2e4a5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('movie', 0.9055121541023254),\n",
      " ('films', 0.8914433717727661),\n",
      " ('directed', 0.8124362826347351),\n",
      " ('documentary', 0.8075793981552124),\n",
      " ('drama', 0.7929168939590454),\n",
      " ('movies', 0.7889865040779114),\n",
      " ('comedy', 0.7842751145362854),\n",
      " ('starring', 0.7573285102844238),\n",
      " ('cinema', 0.7419455647468567),\n",
      " ('hollywood', 0.7307389974594116)]\n",
      "[('vehicle', 0.8630837798118591),\n",
      " ('truck', 0.8597878813743591),\n",
      " ('cars', 0.837166965007782),\n",
      " ('driver', 0.8185911178588867),\n",
      " ('driving', 0.781263530254364),\n",
      " ('motorcycle', 0.7553156614303589),\n",
      " ('vehicles', 0.7462257146835327),\n",
      " ('parked', 0.74594646692276),\n",
      " ('bus', 0.737270712852478),\n",
      " ('taxi', 0.7155269384384155)]\n"
     ]
    }
   ],
   "source": [
    "# Use most_similar() to find the n vocabulary words closest (most similar) to a given word\n",
    "for query_word in ('film', 'car'):\n",
    "    pprint.pprint(model.most_similar(query_word))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8b62f7ad",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "japanese\n",
      "panda\n",
      "longest\n",
      "terrible\n",
      "queen\n"
     ]
    }
   ],
   "source": [
    "# 利用GloVe展示一个类比的例子\n",
    "def analogy(x1, x2, y1):\n",
    "    # 寻找top-N最相似的词。\n",
    "    result = model.most_similar(positive=[y1, x2], negative=[x1])\n",
    "    return result[0][0]\n",
    "\n",
    "# Each triple (x1, x2, y1) asks: x1 is to x2 as y1 is to ...?\n",
    "analogy_queries = [\n",
    "    ('china', 'chinese', 'japan'),\n",
    "    ('australia', 'koala', 'china'),\n",
    "    ('tall', 'tallest', 'long'),\n",
    "    ('good', 'fantastic', 'bad'),\n",
    "    ('man', 'woman', 'king'),\n",
    "]\n",
    "for query in analogy_queries:\n",
    "    print(analogy(*query))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0c308cee",
   "metadata": {},
   "source": [
    "下面将展示word2vec的代码，包括文本预处理、skip-gram算法的实现，以及使用PyTorch进行优化。这里使用《小王子》这本书作为训练语料。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "590fc408",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to C:\\Users\\WuMing/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "# 安装NLTK，使用如下代码下载punkt组件\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "from collections import defaultdict\n",
    "\n",
    "# 使用类管理数据对象，包括文本读取、文本预处理等\n",
    "class TheLittlePrinceDataset:\n",
    "    def __init__(self, tokenize=True):\n",
    "        # 利用NLTK函数进行分句和分词\n",
    "        text = open('./data/the little prince.txt', 'r', encoding='utf-8').read()\n",
    "        if tokenize:\n",
    "            self.sentences = sent_tokenize(text.lower())\n",
    "            self.tokens = [word_tokenize(sent) for sent in self.sentences]\n",
    "        else:\n",
    "            self.text = text\n",
    "\n",
    "    def build_vocab(self, min_freq=1):\n",
    "        # 统计词频\n",
    "        frequency = defaultdict(int)\n",
    "        for sentence in self.tokens:\n",
    "            for token in sentence:\n",
    "                frequency[token] += 1\n",
    "        self.frequency = frequency\n",
    "\n",
    "        # 加入<unk>处理未登录词，加入<pad>用于对齐变长输入进而加速\n",
    "        self.token2id = {'<unk>': 1, '<pad>': 0}\n",
    "        self.id2token = {1: '<unk>', 0: '<pad>'}\n",
    "        for token, freq in sorted(frequency.items(), key=lambda x: -x[1]):\n",
    "            # 丢弃低频词\n",
    "            if freq > min_freq:\n",
    "                self.token2id[token] = len(self.token2id)\n",
    "                self.id2token[len(self.id2token)] = token\n",
    "            else:\n",
    "                break\n",
    "\n",
    "    def get_word_distribution(self):\n",
    "        distribution = np.zeros(vocab_size)\n",
    "        for token, freq in self.frequency.items():\n",
    "            if token in dataset.token2id:\n",
    "                distribution[dataset.token2id[token]] = freq\n",
    "            else:\n",
    "                # 不在词表中的词按<unk>计算\n",
    "                distribution[1] += freq\n",
    "        distribution /= distribution.sum()\n",
    "        return distribution\n",
    "\n",
    "    # 将分词结果转化为索引表示\n",
    "    def convert_tokens_to_ids(self, drop_single_word=True):\n",
    "        self.token_ids = []\n",
    "        for sentence in self.tokens:\n",
    "            token_ids = [self.token2id.get(token, 1) for token in sentence]\n",
    "            # 忽略只有一个token的序列，无法计算loss\n",
    "            if len(token_ids) == 1 and drop_single_word:\n",
    "                continue\n",
    "            self.token_ids.append(token_ids)\n",
    "        \n",
    "        return self.token_ids\n",
    "\n",
    "# Load and tokenise the corpus, build the vocabulary, and convert each sentence to ids\n",
    "dataset = TheLittlePrinceDataset(tokenize=True)\n",
    "dataset.build_vocab(min_freq=1)\n",
    "sentences = dataset.convert_tokens_to_ids(drop_single_word=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "efc882de",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(76044, 2) [[  4  16]\n",
      " [  4  19]\n",
      " [ 16   4]\n",
      " ...\n",
      " [130   3]\n",
      " [  3  86]\n",
      " [  3 130]]\n"
     ]
    }
   ],
   "source": [
    "# Enumerate every (center word, context word) pair\n",
    "window_size = 2\n",
    "data = []\n",
    "\n",
    "for sentence in sentences:\n",
    "    n_tokens = len(sentence)\n",
    "    for i, center_word in enumerate(sentence):\n",
    "        # Clamp the context window to the sentence boundaries.\n",
    "        window_start = max(0, i - window_size)\n",
    "        window_end = min(n_tokens, i + window_size + 1)\n",
    "        for j in range(window_start, window_end):\n",
    "            if j != i:\n",
    "                context_word = sentence[j]\n",
    "                data.append([center_word, context_word])\n",
    "\n",
    "# numpy must be installed beforehand\n",
    "import numpy as np\n",
    "data = np.array(data)\n",
    "print(data.shape, data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "30903b3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 需要提前安装PyTorch\n",
    "import torch\n",
    "from torch import nn\n",
    "import torch.nn.functional as F\n",
    "\n",
    "# 实现skipgram算法，使用对比学习计算损失\n",
    "class SkipGramNCE(nn.Module):\n",
    "    def __init__(self, vocab_size, embed_size, distribution,\\\n",
    "                 neg_samples=20):\n",
    "        super(SkipGramNCE, self).__init__()\n",
    "        print(f'vocab_size = {vocab_size}, embed_size = {embed_size}, '+\\\n",
    "              f'neg_samples = {neg_samples}')\n",
    "        self.input_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        self.output_embeddings = nn.Embedding(vocab_size, embed_size)\n",
    "        distribution = np.power(distribution, 0.75)\n",
    "        distribution /= distribution.sum()\n",
    "        self.distribution = torch.tensor(distribution)\n",
    "        self.neg_samples = neg_samples\n",
    "        \n",
    "    def forward(self, input_ids, labels):\n",
    "        i_embed = self.input_embeddings(input_ids)\n",
    "        o_embed = self.output_embeddings(labels)\n",
    "        batch_size = i_embed.size(0)\n",
    "        n_words = torch.multinomial(self.distribution, batch_size * \\\n",
    "            self.neg_samples, replacement=True).view(batch_size, -1)\n",
    "        n_embed = self.output_embeddings(n_words)\n",
    "        pos_term = F.logsigmoid(torch.sum(i_embed * o_embed, dim=1))\n",
    "        # 负采样，用于对比学习\n",
    "        neg_term = F.logsigmoid(- torch.bmm(n_embed, \\\n",
    "            i_embed.unsqueeze(2)).squeeze())\n",
    "        neg_term = torch.sum(neg_term, dim=1)\n",
    "        loss = - torch.mean(pos_term + neg_term)\n",
    "        return loss"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "1d9da6c8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0.00000000e+00 5.43983724e-02 5.34295679e-02 ... 9.68804495e-05\n",
      " 9.68804495e-05 9.68804495e-05]\n",
      "vocab_size = 1078, embed_size = 128, neg_samples = 20\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "epoch:   0%|                        | 0/100 [00:00<?, ?it/s]C:\\Users\\WuMing\\AppData\\Local\\Temp\\ipykernel_37080\\2573102017.py:42: UserWarning: Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.\n",
      "Consider using tensor.detach() first. (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\aten\\src\\ATen\\native\\Scalar.cpp:23.)\n",
      "  pbar.set_description(f'epoch-{epoch}, loss={loss.item():.4f}')\n",
      "epoch-99, loss=3.4743: 100%|█| 100/100 [01:43<00:00,  1.03s/\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjIAAAGwCAYAAACzXI8XAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8ekN5oAAAACXBIWXMAAA9hAAAPYQGoP6dpAABLOUlEQVR4nO3dd3hUVf4G8Hf6pE0apEFClx56CaAgoDQRFBuioosdFGSt6+r+rLAWbETssCiKyyqoKCIChmIKBEInEAgkJCSB9DrJzJzfH5O5MKaHuTOZ5P08Tx7J3DuTk4tk3pzv956jEEIIEBEREbkhpasHQERERNRcDDJERETkthhkiIiIyG0xyBAREZHbYpAhIiIit8UgQ0RERG6LQYaIiIjcltrVA5CbxWJBZmYmfHx8oFAoXD0cIiIiagQhBIqLixEWFgalsu55l1YfZDIzMxEeHu7qYRAREVEzpKeno2PHjnUeb/VBxsfHB4D1QhgMBhePhoiIiBqjqKgI4eHh0vt4XVp9kLGVkwwGA4MMERGRm2moLYTNvkREROS2GGSIiIjIbTHIEBERkdtikCEiIiK3xSBDREREbotBhoiIiNwWgwwRERG5LQYZIiIiclsMMkREROS2GGSIiIjIbTHIEBERkdtikCEiIiK3xSDjQJUmC0xmi6uHQURE1GYwyDhIRkE5Jiz7A1Pf3wmLRbh6OERERG2C2tUDaA2KKqpw38oEpOeVAwBKK03w0WtcPCoiIqLWjzMyV6jSZMEjXyXiRHaJ9FiVmTMyREREzsAgcwWEEHh+/SHsTsmFp1YFhcL6OPtkiIiInINB5gos35aCdYnnoFQA0XcOhlZlvZyVDDJEREROwSDTTOv3n8PbW04AAF6e0Q/X9gqSggxLS0RERM7BINMMQghsOpQFAHjomq64a2QnAIBaZa0tsbRERETkHLxrqRkUCgU+nDMY/0s8h9uGhkuPa1haIiIicioGmWZSq5S4Y3iE3WO2IGNiaYmIiMgpWFpyIE11aamKMzJEREROwSDjQCwtEREROReDjAOxtERERORcDDIOxNISERGRczHIOJBGWkeGQYaIiMgZGGQcSC3NyLC0RERE5AwMMg7EGRkiIiLnYpBxIC2DDBERkVMxyDgQS0tERETOxSDjQCwtEREROReDjANpuY4MERGRUzHIOJCttMSVfYmIiJyDQcaBWFoiIiJyLgYZB+IWBURERM7FIONA3KKAiIjIuRhkHIi7XxMRETkXg4wDqVlaIiIicioGGQfSsrRERETkVAwyDnTpriXOyBARETkDg4wDqXn7NRERkVMxyDgQS0tERETO1WKCzNKlS6FQKLBo0SLpsYqKCsyfPx+BgYHw9vbGrFmzkJ2d7bpBNkDN0hIREZFTtYggs2fPHnz88ceIjIy0e/yJJ57ATz/9hHXr1iEmJgaZmZm4+eabXTTKhnFlXyIiIudyeZApKSnBnDlz8Omnn8Lf3196vLCwEJ9//jmWLVuG8ePHY8iQIVi5ciX+/PNPxMXFuXDEdeOCeERERM7l8iAzf/58TJs2DRMnTrR7PDExEVVVVXaP9+rVCxEREYiNja3z9YxGI4qKiuw+nIVbFBARETmX2pVffO3atdi3bx/27NlT41hWVha0Wi38/PzsHg8ODkZWVladr7lkyRK89NJLjh5qo3BlXyIiIudy2YxMeno6Fi5ciDVr1kCv1zvsdZ977jkUFhZKH+np6Q577YbYSksmC4MMERGRM7gsyCQmJiInJweDBw+GWq2GWq1GTEwM3n//fajVagQHB6OyshIFBQV2z8vOzkZISEidr6vT6WAwGOw+nEVq9jWxtEREROQMListTZgwAYcOHbJ77L777kOvXr3wzDPPIDw8HBqNBlu3bsWsWbMAAMnJyUhLS0NUVJQrhtwg3rVERETkXC4LMj4+PujXr5/dY15eXggMDJQenzdvHhYvXoyAgAAYDAY89thjiIqK
wsiRI10x5AapbXctsbRERETkFC5t9m3IO++8A6VSiVmzZsFoNGLSpEn48MMPXT2sOmlZWiIiInKqFhVk/vjjD7vP9Xo9oqOjER0d7ZoBNRFLS0RERM7l8nVkWhM1F8QjIiJyKgYZB9JyryUiIiKnYpBxIGllXzb7EhEROQWDjANdKi0JCMFZGSIiIrkxyDiQbUYGYHmJiIjIGRhkHMi2RQHA8hIREZEzMMg4kN2MDNeSISIikh2DjAOplZdmZLgDNhERkfwYZBxIoVBwB2wiIiInYpBxMO6ATURE5DwMMg5mCzIsLREREcmPQcbBWFoiIiJyHgYZB2NpiYiIyHkYZBxMWt2XMzJERESyY5BxsEszMgwyREREcmOQcTDugE1EROQ8DDIOxtISERGR8zDIOBhLS0RERM7DIONgGpaWiIiInIZBxsG4jgwREZHzMMg4mLSyL0tLREREsmOQcTC10npJTRaWloiIiOTGIONgWnX1XUvca4mIiEh2DDIOxtISERGR8zDIOBhLS0RERM7DIONgUmmJMzJERESyY5BxsEvryDDIEBERyY1BxsFspaUqlpaIiIhkxyDjYBqWloiIiJyGQcbBNGz2JSIichoGGQeTbr9mjwwREZHsGGQcjKUlIiIi52GQcTCWloiIiJyHQcbBbLtfs7REREQkPwYZB9Ooq2+/ZmmJiIhIdgwyDsbSEhERkfMwyDiYhrtfExEROQ2DjINx92siIiLnYZBxMO5+TURE5DwMMg6mZWmJiIjIaRhkHEzaNNLMGRkiIiK5Mcg4mK1HhjMyRERE8mOQcTCWloiIiJyHQcbBpGZflpaIiIhkxyDjYNz9moiIyHkYZByMpSUiIiLnYZBxMJaWiIiInIdBxsFsm0aytERERCQ/BhkH0yitpSUTgwwREZHsGGQczNbsaxGAmdsUEBERyYpBxsFspSWADb9ERERyY5BxMHV1aQlgkCEiIpIbg4yD2UpLAPdbIiIikhuDjIOplAqolFxLhoiIyBkYZGSgZpAhIiJyCgYZGWilHbBZWiIiIpITg4wM1CquJUNEROQMDDIy4MaRREREzsEgIwMNS0tEREROwSAjAw1LS0RERE7BICMDlpaIiIicg0FGBiwtEREROQeDjAxYWiIiInIOBhkZXJqRYZAhIiKSE4OMDGzryFSytERERCQrBhkZ2GZkWFoiIiKSF4OMDLQsLRERETkFg4wMbKUl3rVEREQkLwYZGbDZl4iIyDkYZGTA0hIREZFzuDTIrFixApGRkTAYDDAYDIiKisKmTZuk4xUVFZg/fz4CAwPh7e2NWbNmITs724UjbhyWloiIiJzDpUGmY8eOWLp0KRITE7F3716MHz8eM2bMwJEjRwAATzzxBH766SesW7cOMTExyMzMxM033+zKITcKS0tERETOoXblF58+fbrd56+99hpWrFiBuLg4dOzYEZ9//jm+/vprjB8/HgCwcuVK9O7dG3FxcRg5cqQrhtwoDDJERETO0WJ6ZMxmM9auXYvS0lJERUUhMTERVVVVmDhxonROr169EBERgdjY2Dpfx2g0oqioyO7D2S5tUcDSEhERkZxcHmQOHToEb29v6HQ6PPzww1i/fj369OmDrKwsaLVa+Pn52Z0fHByMrKysOl9vyZIl8PX1lT7Cw8Nl/g5q4u7XREREzuHyINOzZ08kJSUhPj4ejzzyCObOnYujR482+/Wee+45FBYWSh/p6ekOHG3jqKWVfTkjQ0REJCeX9sgAgFarRffu3QEAQ4YMwZ49e/Dee+/h9ttvR2VlJQoKCuxmZbKzsxESElLn6+l0Ouh0OrmHXS+tdNcSZ2SIiIjk5PIZmb+yWCwwGo0YMmQINBoNtm7dKh1LTk5GWloaoqKiXDjChrG0RERE5BwunZF57rnnMGXKFERERKC4uBhff/01/vjjD2zevBm+vr6YN28eFi9ejICAABgMBjz22GOIiopq0XcsASwtEREROYtLg0xOTg7uuece
nD9/Hr6+voiMjMTmzZtx3XXXAQDeeecdKJVKzJo1C0ajEZMmTcKHH37oyiE3CktLREREzuHSIPP555/Xe1yv1yM6OhrR0dFOGpFjqLmODBERkVO0uB6Z1uDSgngsLREREcmJQUYGGpaWiIiInIJBRgYaNvsSERE5BYOMDHj7NRERkXMwyMiApSUiIiLnYJCRAUtLREREzsEgIwMNb78mIiJyCgYZGairS0vskSEiIpIXg4wMtCwtEREROQWDjAxYWiIiInIOBhkZsLRERETkHAwyMmBpiYiIyDkYZGTA0hIREZFzMMjIwFZaMlkEhOCsDBERkVwYZGRgm5EBuAM2ERGRnBhkZGDbogBgeYmIiEhODDIyuHxGhg2/RERE8mGQkYFaeWlGhrdgExERyYdBRgYKhYI7YBMRETkBg4xMuAM2ERGR/BhkZGILMiwtERERyYdBRiYaaS0ZBhkiIiK5MMjIRFrd18TSEhERkVwYZGTCjSOJiIjkxyAjk0vNvgwyREREcmGQkYlW2jiSpSUiIiK5MMjIRM11ZIiIiGTHICMTqdmXQYaIiEg2DDIy0bC0REREJDsGGZlwHRkiIiL5McjIRFrZ18QgQ0REJBcGGZmwtERERCQ/BhmZsLREREQkPwYZmbC0REREJD8GGZmolSwtERERyY1BRiZadXVpievIEBERyYZBRiZcEI+IiEh+DDIykUpLFpaWiIiI5NKsIPOf//wHP//8s/T5008/DT8/P4waNQpnz5512ODcmaa6tFTFZl8iIiLZNCvIvP766/Dw8AAAxMbGIjo6Gm+88QbatWuHJ554wqEDdFdalpaIiIhkp27Ok9LT09G9e3cAwIYNGzBr1iw8+OCDGD16NMaNG+fI8bktlpaIiIjk16wZGW9vb+Tm5gIAfvvtN1x33XUAAL1ej/LycseNzo2xtERERCS/Zs3IXHfddbj//vsxaNAgnDhxAlOnTgUAHDlyBJ07d3bk+NyWRsnSEhERkdyaNSMTHR2NqKgoXLhwAd999x0CAwMBAImJiZg9e7ZDB+iubFsUsLREREQkn2bNyPj5+WH58uU1Hn/ppZeueECthUZdPSPD0hIREZFsmjUj8+uvv2LXrl3S59HR0Rg4cCDuvPNO5OfnO2xw7oylJSIiIvk1K8g89dRTKCoqAgAcOnQIf//73zF16lSkpqZi8eLFDh2gu7I1+5pYWiIiIpJNs0pLqamp6NOnDwDgu+++ww033IDXX38d+/btkxp/2zrufk1ERCS/Zs3IaLValJWVAQB+//13XH/99QCAgIAAaaamrbOtI8MZGSIiIvk0a0ZmzJgxWLx4MUaPHo2EhAR8++23AIATJ06gY8eODh2gu7Ltfs0eGSIiIvk0a0Zm+fLlUKvV+N///ocVK1agQ4cOAIBNmzZh8uTJDh2gu7LNyLC0REREJJ9mzchERERg48aNNR5/5513rnhArYWtR4alJSIiIvk0K8gAgNlsxoYNG3Ds2DEAQN++fXHjjTdCpVI5bHDujKUlIiIi+TUryKSkpGDq1KnIyMhAz549AQBLlixBeHg4fv75Z3Tr1s2hg3RH0qaRLC0RERHJplk9Mo8//ji6deuG9PR07Nu3D/v27UNaWhq6dOmCxx9/3NFjdEu20hK3KCAiIpJPs2ZkYmJiEBcXh4CAAOmxwMBALF26FKNHj3bY4NwZS0tERETya9aMjE6nQ3FxcY3HS0pKoNVqr3hQrYG0joyZMzJERERyaVaQueGGG/Dggw8iPj4eQggIIRAXF4eHH34YN954o6PH6JZsm0ZWckaGiIhINs0KMu+//z66deuGqKgo6PV66PV6jBo1Ct27d8e7777r4CG6J42SpSUiIiK5NatHxs/PDz/88ANSUlKk26979+6N7t27O3Rw7szW7CsEYLYIqKqDDRERETlOo4NMQ7tab9++XfrzsmXLmj+iVsJWWgKsszIqJdfXISIicrRGB5n9+/c36jyFgjMPAKC+bAam
0myBXsMgQ0RE5GiNDjKXz7hQw2ylJYB3LhEREcmlWc2+1DCVUiH1xbDhl4iISB4MMjJSM8gQERHJikFGRlrbNgUsLREREcmCQUZGahVnZIiIiOTEICMjaeNIBhkiIiJZMMjISMPSEhERkaxcGmSWLFmCYcOGwcfHB0FBQZg5cyaSk5PtzqmoqMD8+fMRGBgIb29vzJo1C9nZ2S4acdNoWFoiIiKSlUuDTExMDObPn4+4uDhs2bIFVVVVuP7661FaWiqd88QTT+Cnn37CunXrEBMTg8zMTNx8880uHHXjsbREREQkr2btteQov/76q93nq1atQlBQEBITE3HNNdegsLAQn3/+Ob7++muMHz8eALBy5Ur07t0bcXFxGDlypCuG3WgsLREREcmrRfXIFBYWAgACAgIAAImJiaiqqsLEiROlc3r16oWIiAjExsbW+hpGoxFFRUV2H64ilZZMnJEhIiKSQ4sJMhaLBYsWLcLo0aPRr18/AEBWVha0Wi38/Pzszg0ODkZWVlatr7NkyRL4+vpKH+Hh4XIPvU62GRmThUGGiIhIDi0myMyfPx+HDx/G2rVrr+h1nnvuORQWFkof6enpDhph09nWkalkaYmIiEgWLu2RsVmwYAE2btyIHTt2oGPHjtLjISEhqKysREFBgd2sTHZ2NkJCQmp9LZ1OB51OJ/eQG0WakWGzLxERkSxcOiMjhMCCBQuwfv16bNu2DV26dLE7PmTIEGg0GmzdulV6LDk5GWlpaYiKinL2cJtMy7uWiIiIZOXSGZn58+fj66+/xg8//AAfHx+p78XX1xceHh7w9fXFvHnzsHjxYgQEBMBgMOCxxx5DVFRUi79jCWBpiYiISG4uDTIrVqwAAIwbN87u8ZUrV+Lee+8FALzzzjtQKpWYNWsWjEYjJk2ahA8//NDJI20elpaIiIjk5dIgI0TDMxV6vR7R0dGIjo52wogci6UlIiIiebWYu5Zao0u7X7O0REREJAcGGRlxiwIiIiJ5McjIiEGGiIhIXgwyMrJtUWBiaYmIiEgWDDIyss3IVHJGhoiISBYMMjJSs7REREQkKwYZGQV6aQEAZ3PLXDwSIiKi1olBRkYjuwYCABJS81BRZXbxaIiIiFofBhkZXRXsjfY+OhhNFuw7m+/q4RAREbU6DDIyUigUGNO9HQBgZ8pFF4+GiIio9WGQkZktyOxmkCEiInI4BhmZjelhDTKHMgqRX1pZ43habhlmrfgTPx7IdPbQiIiI3B6DjMyCDXr0CPKGEEDs6dwax6O3pyDxbD6+jD3j/MERERG5OQYZJ7DNyuw8aV9eKiyrwg8HMgAAmQUVTh8XERGRu2OQcQJbn8yulAt2j3+37xwqqqyL5WUXVcBs4VYGRERETcEg4wQjugZCrVQgPa8cadWL4wkh8FX8Wekck0XgQrHRVUMkIiJySwwyTuCtU2NQhB8AYGf1rEzsqVycvlAKL60K7bytKwBnFJS7aohERERuiUHGScZ0bw/g0m3YX8ZZZ2NuHtwRXdt5AwAyGWSIiIiahEHGSWwNv7tTcpFZUI7fjmYDAO4a2QmhfnoADDJERERNxSDjJAM6+sJHp0ZheRVe2HAYZovA8M4B6BnigzA/DwDA+ULeuURERNQUDDJOolYpMbKbdRPJrcdzAAB3RXUCACnIsEeGiIioaRhknMh2GzYAtPPWYnLfEABAB5aWiIiImoVBxolsfTIAcPuwcGjV1stvm5FhkCEiImoaBhkn6trOC33DDPDRq3HniE7S46G+1iCTX1aF8kqzq4ZHRETkdtSuHkBbolAo8O1DUTBWmRHorZMeN+jV8NapUWI0IbOwHN3ae7twlERERO6DMzJO5q1T24UYwBpwwtgnQ0RE1GQMMi0E+2SIiIiajkGmhbD1yWRwF2wiIqJGY5BpIXgLNhERUdMxyLQQLC0RERE1HYNMC8FtCoiIiJqOQaaF6HDZNgVCiGa9xoViI8oqTY4cFhERUYvGINNCBBv0
UCiASpMFuaWVTX5+YVkVrnljO+74JE6G0REREbVMDDIthFatRPvq9WWa0yeTmluK8iozkrOKHT00IiKiFotBpgW5kobfgjLrLI7RZEGV2eLQcREREbVUDDItSAcpyDS94bewvEr6c6mRfTJERNQ2MMi0IFeyTUH+ZX01xRUMMkRE1DYwyLQgttV9MwubUVq6fEaGdy4REVEbwSDTgoT5NX+bgoKyS0GmhDMyRETURjDItCAdHNDsCwDF7JEhIqI2gkGmBbH1yFwoNsJoMjfpuZeXljgjQ0REbQWDTAsS4KWFTm39K8kuNDbpuZeXlnjXEhERtRUMMi2IQqG4rE+maeWly0tLJQwyRETURjDItDDNvQX78tISb78mIqK2gkGmhQnzbXrDr8UiuCAeERG1SQwyLYy0TUFh42/BLqqowuUbZrO0REREbQWDTAvTnFuwL2/0BXj7NRERtR0MMi1MaDN6ZPIva/QFWFoiIqK2g0Gmhbl8B2xxeb2oHpc3+gJcR4aIiNoOBpkWxtbsW1ppRlF54wJJYXVpSamwfs4eGSIiaisYZFoYD60KAV5aAI1fS8ZWWrJtOskgQ0REbQWDTAsUYrD2yWQVNS7I2Jp9O/ozyBARUdvCINMCBRl0AICLxZUNnGllW0Omo78nAGuPTGP7a4iIiNwZg0wL1N7bGmQulDRuvyVback2I2OyCBhNFnkGR0RE1IIwyLRA7X2qg0xx44KMrbRkW4MGYHmJiIjaBgaZFqjJQaa6tOTvpYWXVgWAt2ATEVHbwCDTAjV9RsZaWvL31MBbrwbAGRkiImobGGRaoKb2yNhKS36eGnjrGGSIiKjtYJBpgZoyI2O2CBRVWIOMr4f2UpBhaYmIiNoABpkWyBZkSowmlFXWH0iKyi/tfO13WWmptIHnERERtQYMMi2Qt04Nndr6V9PQWjK2Rl9vnRoalVKakSnmjAwREbUBDDItkEKhuFReKqmo91xbo6+vhwYA4MUeGSIiakMYZFqoxvbJ2Bp9/b2sQcanOsiUMsgQEVEbwCDTQkl3LjUUZMqtMzJ+HtaNJm09MiwtERFRW8Ag00I1dUbG15OlJSIiansYZFqoSz0y9Tf75ttKS54sLRERUdvDINNCNXZGprCs9tISZ2SIiKgtYJBpoRq7um/+Zav6AoCXlj0yRETUdjDItFC2GZmLDTb72oIMZ2SIiKjtYZBpoS4vLQnb0r21uFRasvXIWP/LHhkiImoLXBpkduzYgenTpyMsLAwKhQIbNmywOy6EwIsvvojQ0FB4eHhg4sSJOHnypGsG62TtqktLlWYLisrrDiU1Sks6FQDutURERG2DS4NMaWkpBgwYgOjo6FqPv/HGG3j//ffx0UcfIT4+Hl5eXpg0aRIqKupf7bY10GtUMFSXiepb3de2sm+N0lKlqd6ZHCIiotZA7covPmXKFEyZMqXWY0IIvPvuu/jnP/+JGTNmAABWr16N4OBgbNiwAXfccYczh+oS7X10KKowIafYiO5BPjWOW3e+ts68+Hnal5aEAMoqzdK6MkRERK1Ri+2RSU1NRVZWFiZOnCg95uvrixEjRiA2NrbO5xmNRhQVFdl9uKuGbsEurG70BS7ttaTXKKFUWB9jwy8REbV2LTbIZGVlAQCCg4PtHg8ODpaO1WbJkiXw9fWVPsLDw2Udp5za++gB1B1kbGUln+qdrwHrhpPcAZuIiNqKFhtkmuu5555DYWGh9JGenu7qITVbQ2vJ2G69tm1PYOOj551LRETUNrTYIBMSEgIAyM7Otns8OztbOlYbnU4Hg8Fg9+GuGiotXWr0tQ8y0p1LDDJERNTKtdgg06VLF4SEhGDr1q3SY0VFRYiPj0dUVJQLR+Y8DQcZ2z5LWrvHWVoiIqK2wqW3tJSUlCAlJUX6PDU1FUlJSQgICEBERAQWLVqEV199FT169ECXLl3wwgsvICwsDDNnznTdoJ2osUHG1uhr483SEhERtREuDTJ79+7FtddeK32+
ePFiAMDcuXOxatUqPP300ygtLcWDDz6IgoICjBkzBr/++iv0er2rhuxUth6Zi3X1yNRRWvJmaYmIiNoIlwaZcePG1btom0KhwMsvv4yXX37ZiaNqOWwzMrmllTCZLVCr7CuBtmbfukpLDDJERNTatdgeGQICvLRQKqyL2+WVVtY4XmdpqXpRPAYZIiJq7RhkWjCVUoHA6vJSTi19Mvl/2Z7Axpv7LRERURvBINPC1beWTKFUWvprsy9LS0RE1DYwyLRw9d25lF9nsy9LS0RE1DYwyLRw7bzrDjKXemT+UlqyzciwtERERK0cg0wLV9eMjMlskRa8q1Fa4u3XRETURjDItHC2IPPXtWRq2/naxlZa4oJ4RETU2jHItHB1zcjY1pDx0alrrC8jbVHAIENERK0cg0wLV9ddS7b+GD8vTY3nSAvisUeGiIhaOQaZFq7OGRnbHUt/afQFLjX7lleZYbbUvXIyERGRu2OQaeFsQaa4woSKKrP0uDQj41lzRsarutkXYMMvERG1bgwyLZxBr4ZWbf1runxWxtYj89dVfQFAp1ZBW903wyBDREStGYNMC6dQKGrtk7lUWqo5IwPUvZZMVmEFsgor5BgqERGR0zHIuIHa+mTqKy0Bte+AXVFlxg0f7MKNy3fZlamIiIjcFYOMG6gtyOSWWv9cW2kJALxqCTLHzhfhYokROcVGHD1fJNdwiYiInIZBxg1cHmTMFoG3Nifjl0NZAIBOAZ61PsenlluwD2deCi8H0gtkGi0REZHzqF09AGqYrUfmRHYx7l2ZgJ0nLwIA7h3VGeN7BdX6HFuPzOWr+x7JKJT+nMQgQ0RErQCDjBuwzchsOmydhfHQqLB0Vn/MGNihzud41bK67+HMS0GGMzJERNQaMMi4AVuQAYCu7bzw0d1DcFWwT73P+evqvkaTGclZxdLxM7llyC+thL9X7T02RERE7oA9Mm5gWOcAdGvvhZkDw/DDgtENhhgA8LGVliqtQeZkdgmqzAK+Hhp0becFADhwrkC2MRMRETkDZ2TcQICXFlv/Pq5Jz/HSVpeWqmdkDlf3x/TrYECwjx6nL5YiKb0A43rW3mNDRETkDjgj00pJC+JV98jY+mP6hfliQLgfAPbJEBGR++OMTCtlu/3adtfS4Qzrrdd9O/giovqW7aT0AgghoFAoXDNIIiKiK8QZmVbK67JmX5PZgmPVC+D1CzOgd6gPtCol8suqkJ5X7sphEhERXREGmVbKVloqNppw6kIpjCYLvLQqdA70gk6tQu8wAwBgf3q+K4dJRER0RRhkWinvy0pLtkbfvmG+UCqtZaRBUp9MYa3Pb4jJbEF+aeWVD5SIiOgKMMi0UpdvGmlr9O3bwSAdHxDuCwBIasaMTEJqHsa/HYPhr/+OMxdLHTBaIiKi5mGQaaWku5YqTDiSYeuP8ZWODwz3B2Ddf6nKbGnUa1ZUmfHqxqO4/ZNYpOWVocossO14joNHTkRE1HgMMq2UbUam0mzBIWkNmUtBpnOgJ3w9NKg0WXD8fHGtr3G5/Wn5mPr+Tny2KxVCAF2qF9XbezZPhtETERE1DoNMK+WlVUl/Lq8yQ69Rolt7L+kxhUIhrSeT1MAKv/vT8nHLR7E4faEUQT46fHHvUPx7ViQAICE1H0IIh4+fiIioMRhkWim1SgkPzaUw0zvUALXK/q97YMfqPpm0gnpf6797z8FsERjdPRC/PXENxvcKRmRHX2hVSlwsMeJMbpnDx09ERNQYDDKtmK1PBrDvj7EZGOEHoP49lywWga3HsgEAD1zdFX6e1k0m9RoVIquD0J5UlpeIiMg1GGRaMVufDGDdY+mvBnT0AwCculCCooqqWl/jcGYhcoqN8NSqMLJroN2xYV0CAAB7ztQeZE5dKMFnO083upmYiIioqRhkWrHLg0zfWmZkAr11CA/wgBDAoXO1ryfz+zHrXUlX92gH/WWlKgAY3rn+IPPEt0l49edjWB17tlnjJyIiagiDTCtmCzIa
lQJXBfvUeo5tVmZ/Wu3rydjKShN7B9c4NriTPxQK4ExuGXKKK+yOpeQU42B1ONqwP6NZ4yciImoIg0wrZttvqWeID7Tq2v+qbeWi7/dlwGyxv/vofGE5jmQWQaEAru0VVOO5vh4a9KwOSHtS7YPQ+svCy6GMQqTkNHyLNxERUVMxyLRiPtXNvrU1+trcNKgDDHo1Tl8sxZaj2XbHtlaXlQaF+6Gdt67W5w+vpU/GYhHYsD8TgDXsAJA+JyIiciQGmVZsRJcAaFQKXN+3ZlnIxkunxt1RnQAAH8WcslsT5ndbWalP3c8fVkufzJ4zecgoKIePTo1/TusNANiQlAGLhevNEBGRYzHItGJ3DI/AkZcmY3yvuoMIAMwd1RlatRJJ6QXYc8ZaIiqrNOHPU7kAau+PsbEFmWPni1BcfeeTraw0pX8IbogMg7dOjXP55Uisow/HEVJyinHTh7vx/b5zsn0NIiJqeRhkWrm6emMuF+Sjx6zBHQEAn+w4BQDYefIiKk0WhAd4oEeQd53PDfHVIzzAAxYBJJ7NR0WVGT8fOg8AmDmoAzy0KkzuFwJAvqZfs0Xg7/89gP1pBXju+0NIdfBGluWVZvx56iJnlIiIWiAGGQIAPHB1FygU1tutT2YXS3crTegVDIVCUe9zbbMye8/kY9vxHBRXmBDmq8fILtZG4psGdQAAbDx4HpUm+zVlTGYLjp0vuqKQsDr2DA5U3yFlNFnwzP8OOjR0/N+PR3Dnp/H4149H6jwnu6gC0dtTkF1UUec5RETkeAwyBADo2t4b11f3wnwUcxrbjl8AAFxXT3+MjW09mYQzeVJZacagDlAqrQFoZNdABBt0KCyvwh/Jl3bLLjWacM8XCZjy3k7c8Ulcs2ZSMgrK8ebmZADAI+O6wVOrQsKZPKyJb9zaNb8dycIbvx5HidFU6/Gcogp8v99arvoy7ix+SKo5q1RQVok7P43Dm5uT8eDqvTXu/iIiIvkwyJDkobHdAADf7TuHiyVG+OjU0mxLfWwr/CalF0hBxTYLAwAqpQIzBlo/31AdBIoqqnDPFwlSH07CmTxMeW8HPtt5utFBQAiBFzYcRlmlGUM7+eOp63vi6Uk9AQBLNx3Hufy694C6WGLEo2sS8eCXifjwj1N4d8uJWs9bHXsWVWYh7Vv17HeHcCL70q3kFVVmPLg6EacuWEPYgXOFWB17plHjp7blrc3JuPvzeBSUVTb6OVmFFZj9SRwWf5uEzIJyGUfnPowmc50rkVPbxCBDksER/tLsCgBc07N9o3psurbzQqCXFpUmC6rMAn3DDDUW4JtZHWR+P5aDtNwyzPk0Holn82HQq/HRXYMxunsgKqosePXnY7j1oz/x04FM/OfPM/j3r8ex+Nsk3P+fvVgTfxbllWbpNX8+dB7bjudAo1Jgyc39oVQqcE9UZwzt5I/SSjP+sf5wjZ25hRD4ISkD1y2LwS+HslA9aYTVcWdrvFFUVJmlmZ03bonEmO7tUF5lxsNfJaLEaILFIvD3dQeQcCYPPjo15o3pAsD6hlXbm05uiRFv/5aMP09dbPCakntJSM3D739ZvuByPx7IxPLtKdh58iJe+/lYo16zoKwS93wRj9jTufh+fwYmvB2D5dtOoqLK3PCTW6lKkwW3fRSLka9vRVJ6QZ3nrdydils/+hPv/X4SJ7KLa/wcIMc64+C+xKZSiFb+N1xUVARfX18UFhbCYKi53xDZ+/1oNu5fvRcA8M7tA3DToI6Net5DX+7F5iPWH+T/nNYb91/d1e64EAKT392J5Oxi+OjUKDaaEOClxVfzRqBPmAFCCHyTkI7XfzlWZ5kHAPw9NbhrZCfMGNgBd3wSh4slRjw+oQcWX3eVdM6pCyWY8t5OVJoseOOWSER1DURKTglOZBdj96lc7DhhLZv1DjXgzVsi8fLGo0hIzcMdw8KxdFak9Dpfx6fhH+sPoYOfB2KeGofC8ipMe38XsooqMC0yFGG+eny6MxUalQL/uW84
RnYNxK0fxyLxbD4m9g7Gp/cMkfqLMgrKcfdn8Thd/Q9+5sAwPD+tD9r71L4+jytVmS1QKxUN9kY5W3mlGXGp1r+/Y+eLMKFXMP42pgtUSteOMz2vDBPejkGl2YKXZ/TFPVGd7Y5nFJRj8rs7UFxx6f/rNfePwOju7ep8zfJKM+Z8Fod9aQUI8tGhU6CndEdhRIAnXrihT6PKvq3Nu7+fwLu/nwQAhPnq8dNjYxD4lzWufjqQice+2W/3WNf2XpjSLwT9O/jC4KGBQa+Br4cGgd5aeGrVoOaxWASW/nocq3afwep5w2vsx3elGvv+zSBDdiwWgdmfxiE9rwybFl0jLWjXkM92nsarPx+DUgHEPTcBQQZ9jXNW/HEK//71OAAgyEeHNfePQI+/zNxkFJTj35uOIy2vDCEGPUJ89Qg26CEg8E1CGtLz7Gc6urb3wqaFV0Ontt8H6sM/UvDGr8m1jlWjUuCx8T3w8Nhu0KqVSDybh1krYqFSKrDliWvQtb03hBC47p0dSMkpsQtmiWfzcfvHsTBdVv66PPCdyC7GtPd3osos8NFdgzG5XyhSckpw9+fxOF9YAX9PDQrKqyAEYNCr8cyUXpg9LELqJ3KUiiozfjqQiUBvbYO339uk5Zbhva0nsSEpA74eGgwK98PAcD8MivDHwAg/u727nCWnqEKaeYtPzavRLD4w3A9v3ToA3eu5s05u87/eh58PWu/UUyiA5bMHY1pkKADrHXWzP41DQmoeBkX4oXeoAV/Hp6FToCd+XXgNPLSqGq9XZbbgwdV7sT35Agx6Nf77cBR6BvvgxwOZeP2XY8guMgIAlt7cH3cMj3DeN+oEOcUVsFisd0P+VXJWMW74wPpvy89Tg4KyKozqFojVfxsOtco6c3zwXAFu/SgWRpMF0yJDUWY0YVfKRVSZa3+b06mVeGVGP9w2LFzW76s1qqgyY/F/k/DLoSwAtf8Ce6UYZKoxyDSdxSKgUKBJv5Gn55Vh8rs7MLV/KN68dUCt52QVVmDC23/Az1OLNfePQOd2Xk0al9ki8NuRLHy68zT2pRVAoQDWPjASI2r5LcBktuDWj2OxP60AGpUCXdt5o0ewN3oE+WBaZAi6B9kHqPv/swe/H8vBtMhQRN85GDEnLmDuFwnw1qnx53PjYdBfCnRf7ErFyxuPAgCemtQT86/tbvdab/+WjA+2pSDYoMM7tw3Egm/2I6+0Et3ae+Gr+0fgQrER/1h/CIczigAAfcMMmNIvBKO6t0NkB1/ph7Ltez5fWA6lQoEwP48Gr1FhWRW+jDuDlbvPILe0EgoF8PncofWGmfOF5fhgWwr+uyfdLqBdzlOrwv/d2Be3DunokJkaIQQKy6uQUVAOk1nAS6eGj14Nb50alSYLfj2ShR+TMhGXmovLf0KF+epxzVXtER7giY/+OIViowlatRJ/v+4q3H91V6fPzthCsEJhXW9py9FsaFVKrLpvGEZ1b4fo7Sl4c3MyvLQq/LLwagR4aXHdsh3IKqrAQ2O74rkpve1ez1auXL8/A3qNEl/NG4Ghl5V7S40mLN10HF/GnYWPXo2ti8fW+kuDM1WZLUjOKkaor77G7Ehjnc0txYfbT+G7feegVCrw/h0DMblfqHTcZLZg1oo/ceBcISb2DsbTk3tiZvRulFWa8fDYbnh2Si9kFVZgRvQuZBcZMb5XED69ZyhUSgWKKqqw/XgOthzNRmZBOYoqTCgqr0JheRWM1cHYXUJhpcmC5dtOQq9V4d5RnV02m5RXWokHVu9F4tl8aFQKvHFLZKNn75uCQaYag4zzmC0CCqDe2YXcEiM8tepafxNtioPnCmAR1t/I61JRZUZ2UQU6+HnYhYPaHM8qwpT3dkIIYONjY/DG5mTsOHEBfxvdBS9O72N3rhAC//nzDNQqJeaMiKjxxl5RZcaU93ba3YUV2dEXq+4bjgAvLQDrD+bVsWfx9m/JKL2s78dHp8awLgEQQuBsXhnO5ZWj0mz9YTs3qhOe
n9an1r6liyVGfLLjNNbEnZVez0urQmmlGT56NX5cMAZd/hIczRaBd7acwCc7T0szHWOvao+FE3tAAWB/WgGS0guQeDYfGdU9PzMGhuHVmf3go69/pu5wRiHW7kmTZg+sP2UEjCYLzhdWILOgHGWVjev1GBzhh6n9QzGuZxC6tfeSrndmQTme+/4QYqpLhUM6+eOLe4c1ehbxSgkhcPOKP7E/rQC3De2IJTdHYsHX+7DpcBa8q1e1/ueGwzBZBN66dQBuGWL9Qb/laDYeWL0XKqUCP8wfjX4drFuIHM0swru/n8BvR7OhUirw6T1Dag2gZovAzOjdOJRRiBsiQ7H8zsE1zskpqsDXCWkoKKtCpdlS3b9mQb8wX8wb0+WKZgCFEDibW4adJy9gx8mLiD2VixKjCR4aFR64piseuqartM9bQ05fKMHy7Sn4ISnTrslfoQBevrEv7q4u032y4xRe/+U4fPRq/L54LIINemw8mIkFX1tLSO/ePhBf7E7FwXOFuCrYG989MqrB/0eFEHjpp6NY9ecZAMCSm/tjdjPDTE5xBU5kleD0xRKcvlCKUxdKUFheBb1GBQ+NCp5aFTy0Kvh7ahHorUU7Lx0CvbUI8dXjqmAfaBr4+QRYFyl96MtE7Dxp7bEL8tHhqUk9MWtwxxp/n0II2UrDZy6W4t6VCTiTWwYfvRof3z0Eo7rVXSa9Egwy1RhkqLEWrd2PDUmZ6BXig+NZxVAqgJinrkV4gGeTX+vPUxdx56fxAICoroH4dO7QWkszOcUV2Hw4C7tTchF7OheF5TXvxtCoFNLU+KAIP0TfOVianakyW/Bl7Fm88/sJqQejV4gPHhnXDdf3CcFdn1ubqq8K9sb6R0dLbzBllSY8/k2StA3F8C4BePL6ntLeWZezWARWxJzCsi0nYLYIdAr0xAezByGyeud0G5PZgs1HsrHqz1Spn6Mh7by10KlVKDGaUGI0SW9mfUINmD4gDDdEhtZ7/YUQWLf3HF7ZeBTFRhPG9WyPz+cOc8rMjK0Xw1OrwvYnxyHYoEdFlRn3rkxA3OlLW3ZMiwzF8tmD7N5YbOWovmEGvHhDH3wUcwrbk62BTKEA3rplAGYNqfs33MMZhbhx+S5YBLDyvmG4tuelTV2zCitw+yexOJtb+117d4/shJdn9G30G50QAul55Yg7nYu407mIT82Tgq2NXqNERZU1DLf30eHJ66/CLUPC6/x7KDWa8ObmZKyOPQNbfhnXsz3mX9sd3+/LwDcJadbrdG03zBrcEVPe2wmjyYJ/z+qP24ddChuvbjyKz3alSp/7e2rww/wxiAhs3L9ZIQRe3ngUK3efAQC8flN/3DkiAhVVZuxLy8fulIs4m1uGmQM71LpVS6nRhNd/OYY18WmN+nq10amV6N/BFwPD/TAwwg+jurWTfuGxyS+txH2r9iApvQAeGhXa+WilMnvfMAMWX3cVyirNOJRRiIPnCnAkowihfnq8ccuAen/Zu1xhWRVWx57BhRIjqswCJrMFJotARZUZBWVVyC+rRGF5FS5WH+/g54FV9w2r0R7gSAwy1RhkqLHO5pZiwtsxUnllct8QfHT3kGa/3tqENGQWlOPRa7tDr2l4BspsETiaWYQ9Z/Kg16jQOdATEYGeCPX1wB/JOXji2yQUVVibpN+7YyBUCgX+76cjOJFdAsD6A+3v11+Fa3sGSW9S2UUVuOGDXbhQbJTeUC+UGHH/f/bi4LlCaNVKvHlLJG4cENbgG1vi2Tw8/k0SMgrKoVEpMDjCHyqlAkqFAgoFkJJTgvOF1gUB1UoFpvYPxYiuAVDAelwBQK1SItRXjzA/D4T66u2uixACFVUWVJotTZ5VOZxRiFkr/oTRZMH8a7vhqUm97I5bLAKf7TqNnw9lwd9TI/VfhRj0GNWtXaPf+GwqqsyYuCwG5/LL8cTEq7BwYg/pWFFFFW7/OA7Hzhch1FePXxdeA19P++/nQrERE5fF2AVXpQKY2j8Uj4zrhr71
bPRqY3sT7+jvgS1PjIWHVoXsogppTaaO/h6YMTAMWpUKWrUSxRVVWBFzCkIAC67tjierlyqoTVmlCX+m5GJbcg5iki/UCC4alQJDOvnj6h7tcU2P9ugTZsDmI1lYWt3fBlgD9dxRnTEtMtSuNLs75SKe+e4gzuVbX3Ni7yA8Nr4HBlS/4Qoh8MG2FCyrXhLBoFejqMKEMd3b4ct5w+3+PzWZLZjzWTziU/OgVirw1f0jmtxwKoTAKxuP4Yvd1kA0tJM/DmUUSmUnm/G9gvDiDX2kknj86Vw89b+D0vfbtZ0Xurb3Qtf23ujW3gvtvHWoqLKgrNKEiiozSivNyC+rRG5JJS6WGJFbUomzuaUoqrC/uUGjUuC6PsG4bWg4ru7RHjnFFbj78wSk5JTAz1ODL+4dhr5hBqz+8yze33bSron8rzQqBZ6f2htzR3Wu99937KlcLP5vkvTvtyEDwv3w6T1DEOQjb1mTQaYagww1xT83HMJXcdbfrtY9HNWodXScJT2vDI+sSZR6a2z8PTV4alIv3D6s9t+A957Jwx2fxMFkEbhvdGf8diQbGQXl8PfU4NN7htr1YDSksKwKz3x3EL8eyar1eKCXFnNGRGDOyE4IdnLvxob9GVj0bRIA4MM5gzG1v7XHotRowpPrDmDT4drHrFEp8LcxXfDY+B6Nbmj+OOYUlmw6jmCDDtufHFejV+FiiRFfxp7F9AGhNfqxbNbtTcdT/zsIjUqBWYM74qGx3WqU/+pTajThumUxyCy09tvMG90Fd3wah9MXStHBzwPfPjQSHf3tA9qa+LN4fv1hAMDzU3vjgWu62r3exoOZ+OVQFmJP59o1VmtUCgzo6IeRXQMxomsAhnTyr7U/w2gy48vYs/hgW4oU0nRqJSb3C8HMQR2w+XAW1u5JBwB08PPAkpv745qr2tf6/X27Jw3/WH8YZouAp1aFzYuuqXV27mKJEe9sOYHxvYIwoZ594eojhMBrPx+zm91p76PDmO7tYNCr8XVCGqrMAlqVEg+N7YqySjO+2J0KIazfx5u3RGJUPXeh1cViEUjNLUVSdRl3z5k8HM+6tE5VmK8eFgFkFVUgxKDHl/OG282A5JVW4t3fT+CXQ1no4O+ByA6+6N/BFz1DfLDij1PSv9Np/UOxdFb/GuW2SpMFy7acwMc7rAG3c6Anpg8Ig1qphFqlgEalgE6tgp+nBn6eWvh7auDnoUV4gIdT7mpkkKnGIENNkVNUgZnRu9Er1IDP5w5tcbcgV1SZ8dJPR/FNQhqUCuCukZ2w+Lqr4Oeprfd5q2PP4MUfLm2x0DnQEyvvG96kN04bIQTiU/NwodgIS/WPD4sQ8NKqcc1V7Rs1+yQX2yyFp1aF9Y+OhqdWhQdW78XxrGJoVAo8NaknfD00yCo0IquoHCezS7D3rLUMFuSjwz+m9saMgWGoqLIgLjUXMckXsDvlIipMZoT7eyLc3xMd/T3wyY7TKDaa8OYtkbh1aPPveElKL0Bo9Z15zWFbLkGlVKCjvwfO5pahg58H1j44ss6S3OV39L0xK9J6J1VCGn5MyrDr1+rg54HxvYIwvlcQRnQNaFJjaUFZJb7dk47/JZ7DyZySGsfvieqEpyf3ajA4bj+eg3e3nsSDV3eV7gSTixAC6xLPocxowuju7dA9yFv693/qQgn+78cjUn+KzR3DwvH8tN4N9uM0xdHMIvx3bzrW78+QwmDXdl5YPW94jWDa0PezcvcZvP7LMZgsAl3aeWFyvxB4VzfWe2rV+M+fZ3Aoo1D6Xl64oU+j+5ucgUGmGoMMNZWcjXKOkpCahwAvbaNvOxZC4Kn/HcT/Es9hSCd/fHrP0Bp1+NbAZLZg7soE7E7JRQc/D5RVmpBfVoV23jp8fPdgDOlUc/Zp67FsvLzxqNRT0qWdFzIKymvc6v1XfUIN2PjYGIffOt9Uj3yVKM02hfrqsfbBkegUWHdAFUJg6abj+HjH6RrHurTzwi1DOuL6PsF2b+TNJYTA
wXOF+F/iOfxYvRzAkpv613qnYUsnhMDmI1l49edjMFsEXr+pP67tFdTwE5uposqM345m4/j5IvxtTBe0a+YdYfvS8rFgzT5k1lE28vPUYOnN/e3uEmspGGSqMcgQWVksAoczC9E71NCouyTcVX5pJaYv3yX1YPTv4ItP7hmCUN+6b1+vqDLj812pWL4tBeXVK+eG+eoxtqe1ByTQW4dz+WVIzytHen4ZCsoq8cR1VzWql0VuttuO1Uplo5c1EELgH+sP4ZuEdGhVSkzpH4I7hkVgZNcA2UK8bSNXVwe/KyWEgEXA5QsxNkV+aSXW7klHTnEFSipMUnN9kI8eT03qWeu6PS0Bg0w1BhmitudoZhEWrt2PIZ388X839m10uet8YTkSUvPQN8yAbu2vfEbCWcoqTdCqlA0uM3A5i0Vg79l89Ajyhn8rnJ0j98cgU41BhoiIyP009v279c4vExERUavHIENERERui0GGiIiI3BaDDBEREbktBhkiIiJyWwwyRERE5LYYZIiIiMhtMcgQERGR22KQISIiIrfFIENERERui0GGiIiI3BaDDBEREbktBhkiIiJyWwwyRERE5LbUrh6A3IQQAKzbgRMREZF7sL1v297H69Lqg0xxcTEAIDw83MUjISIioqYqLi6Gr69vnccVoqGo4+YsFgsyMzPh4+MDhULhsNctKipCeHg40tPTYTAYHPa6VDteb+fhtXYeXmvn4bV2HkddayEEiouLERYWBqWy7k6YVj8jo1Qq0bFjR9le32Aw8B+FE/F6Ow+vtfPwWjsPr7XzOOJa1zcTY8NmXyIiInJbDDJERETkthhkmkmn0+Ff//oXdDqdq4fSJvB6Ow+vtfPwWjsPr7XzOPtat/pmXyIiImq9OCNDREREbotBhoiIiNwWgwwRERG5LQYZIiIiclsMMs0UHR2Nzp07Q6/XY8SIEUhISHD1kNzekiVLMGzYMPj4+CAoKAgzZ85EcnKy3TkVFRWYP38+AgMD4e3tjVmzZiE7O9tFI249li5dCoVCgUWLFkmP8Vo7TkZGBu666y4EBgbCw8MD/fv3x969e6XjQgi8+OKLCA0NhYeHByZOnIiTJ0+6cMTuyWw244UXXkCXLl3g4eGBbt264ZVXXrHbq4fXunl27NiB6dOnIywsDAqFAhs2bLA73pjrmpeXhzlz5sBgMMDPzw/z5s1DSUnJlQ9OUJOtXbtWaLVa8cUXX4gjR46IBx54QPj5+Yns7GxXD82tTZo0SaxcuVIcPnxYJCUlialTp4qIiAhRUlIinfPwww+L8PBwsXXrVrF3714xcuRIMWrUKBeO2v0lJCSIzp07i8jISLFw4ULpcV5rx8jLyxOdOnUS9957r4iPjxenT58WmzdvFikpKdI5S5cuFb6+vmLDhg3iwIED4sYbbxRdunQR5eXlLhy5+3nttddEYGCg2Lhxo0hNTRXr1q0T3t7e4r333pPO4bVunl9++UU8//zz4vvvvxcAxPr16+2ON+a6Tp48WQwYMEDExcWJnTt3iu7du4vZs2df8dgYZJph+PDhYv78+dLnZrNZhIWFiSVLlrhwVK1PTk6OACBiYmKEEEIUFBQIjUYj1q1bJ51z7NgxAUDExsa6aphurbi4WPTo0UNs2bJFjB07VgoyvNaO88wzz4gxY8bUedxisYiQkBDx5ptvSo8VFBQInU4nvvnmG2cMsdWYNm2a+Nvf/mb32M033yzmzJkjhOC1dpS/BpnGXNejR48KAGLPnj3SOZs2bRIKhUJkZGRc0XhYWmqiyspKJCYmYuLEidJjSqUSEydORGxsrAtH1voUFhYCAAICAgAAiYmJqKqqsrv2vXr1QkREBK99M82fPx/Tpk2zu6YAr7Uj/fjjjxg6dChuvfVWBAUFYdCgQfj000+l46mpqcjKyrK71r6+vhgxYgSvdRONGjUKW7duxYkTJwAABw4cwK5duzBlyhQAvNZyacx1jY2NhZ+fH4YOHSqdM3HiRCiV
SsTHx1/R12/1m0Y62sWLF2E2mxEcHGz3eHBwMI4fP+6iUbU+FosFixYtwujRo9GvXz8AQFZWFrRaLfz8/OzODQ4ORlZWlgtG6d7Wrl2Lffv2Yc+ePTWO8Vo7zunTp7FixQosXrwY//jHP7Bnzx48/vjj0Gq1mDt3rnQ9a/uZwmvdNM8++yyKiorQq1cvqFQqmM1mvPbaa5gzZw4A8FrLpDHXNSsrC0FBQXbH1Wo1AgICrvjaM8hQizR//nwcPnwYu3btcvVQWqX09HQsXLgQW7ZsgV6vd/VwWjWLxYKhQ4fi9ddfBwAMGjQIhw8fxkcffYS5c+e6eHSty3//+1+sWbMGX3/9Nfr27YukpCQsWrQIYWFhvNatGEtLTdSuXTuoVKoad29kZ2cjJCTERaNqXRYsWICNGzdi+/bt6Nixo/R4SEgIKisrUVBQYHc+r33TJSYmIicnB4MHD4ZarYZarUZMTAzef/99qNVqBAcH81o7SGhoKPr06WP3WO/evZGWlgYA0vXkz5Qr99RTT+HZZ5/FHXfcgf79++Puu+/GE088gSVLlgDgtZZLY65rSEgIcnJy7I6bTCbk5eVd8bVnkGkirVaLIUOGYOvWrdJjFosFW7duRVRUlAtH5v6EEFiwYAHWr1+Pbdu2oUuXLnbHhwwZAo1GY3ftk5OTkZaWxmvfRBMmTMChQ4eQlJQkfQwdOhRz5syR/sxr7RijR4+usYzAiRMn0KlTJwBAly5dEBISYneti4qKEB8fz2vdRGVlZVAq7d/WVCoVLBYLAF5ruTTmukZFRaGgoACJiYnSOdu2bYPFYsGIESOubABX1CrcRq1du1bodDqxatUqcfToUfHggw8KPz8/kZWV5eqhubVHHnlE+Pr6ij/++EOcP39e+igrK5POefjhh0VERITYtm2b2Lt3r4iKihJRUVEuHHXrcfldS0LwWjtKQkKCUKvV4rXXXhMnT54Ua9asEZ6enuKrr76Szlm6dKnw8/MTP/zwgzh48KCYMWMGbwluhrlz54oOHTpIt19///33ol27duLpp5+WzuG1bp7i4mKxf/9+sX//fgFALFu2TOzfv1+cPXtWCNG46zp58mQxaNAgER8fL3bt2iV69OjB269d6YMPPhARERFCq9WK4cOHi7i4OFcPye0BqPVj5cqV0jnl5eXi0UcfFf7+/sLT01PcdNNN4vz5864bdCvy1yDDa+04P/30k+jXr5/Q6XSiV69e4pNPPrE7brFYxAsvvCCCg4OFTqcTEyZMEMnJyS4arfsqKioSCxcuFBEREUKv14uuXbuK559/XhiNRukcXuvm2b59e60/n+fOnSuEaNx1zc3NFbNnzxbe3t7CYDCI++67TxQXF1/x2BRCXLbkIREREZEbYY8MERERuS0GGSIiInJbDDJERETkthhkiIiIyG0xyBAREZHbYpAhIiIit8UgQ0RERG6LQYaIiIjcFoMMETlM586d8e677zb6/D/++AMKhaLG5pStVVOvDxE1TO3qARCR64wbNw4DBw502Jvrnj174OXl1ejzR40ahfPnz8PX19chX5+I2h4GGSKqlxACZrMZanXDPy7at2/fpNfWarUICQlp7tCIiFhaImqr7r33XsTExOC9996DQqGAQqHAmTNnpHLPpk2bMGTIEOh0OuzatQunTp3CjBkzEBwcDG9vbwwbNgy///673Wv+tXSiUCjw2Wef4aabboKnpyd69OiBH3/8UTr+19LSqlWr4Ofnh82bN6N3797w9vbG5MmTcf78eek5JpMJjz/+OPz8/BAYGIhnnnkGc+fOxcyZM+v9fnft2oWrr74aHh4eCA8Px+OPP47S0lK7sb/yyiuYPXs2vLy80KFDB0RHR9u9RlpaGmbMmAFvb28YDAbcdtttyM7Otjvnp59+wrBhw6DX69GuXTvcdNNNdsfLysrwt7/9DT4+PoiIiMAnn3xS77iJqH4MMkRt1HvvvYeoqCg88MADOH/+PM6fP4/w8HDp
+LPPPoulS5fi2LFjiIyMRElJCaZOnYqtW7di//79mDx5MqZPn460tLR6v85LL72E2267DQcPHsTUqVMxZ84c5OXl1Xl+WVkZ3nrrLXz55ZfYsWMH0tLS8OSTT0rH//3vf2PNmjVYuXIldu/ejaKiImzYsKHeMZw6dQqTJ0/GrFmzcPDgQXz77bfYtWsXFixYYHfem2++iQEDBmD//v149tlnsXDhQmzZsgUAYLFYMGPGDOTl5SEmJgZbtmzB6dOncfvtt0vP//nnn3HTTTdh6tSp2L9/P7Zu3Yrhw4fbfY23334bQ4cOxf79+/Hoo4/ikUceQXJycr3jJ6J6XPH+2UTktsaOHSsWLlxo99j27dsFALFhw4YGn9+3b1/xwQcfSJ936tRJvPPOO9LnAMQ///lP6fOSkhIBQGzatMnua+Xn5wshhFi5cqUAIFJSUqTnREdHi+DgYOnz4OBg8eabb0qfm0wmERERIWbMmFHnOOfNmycefPBBu8d27twplEqlKC8vl8Y+efJku3Nuv/12MWXKFCGEEL/99ptQqVQiLS1NOn7kyBEBQCQkJAghhIiKihJz5sypcxydOnUSd911l/S5xWIRQUFBYsWKFXU+h4jqxxkZIqrV0KFD7T4vKSnBk08+id69e8PPzw/e3t44duxYgzMykZGR0p+9vLxgMBiQk5NT5/menp7o1q2b9HloaKh0fmFhIbKzs+1mOVQqFYYMGVLvGA4cOIBVq1bB29tb+pg0aRIsFgtSU1Ol86KiouyeFxUVhWPHjgEAjh07hvDwcLtZqz59+sDPz086JykpCRMmTKh3LJdfD4VCgZCQkHqvBxHVj82+RFSrv9599OSTT2LLli1466230L17d3h4eOCWW25BZWVlva+j0WjsPlcoFLBYLE06XwjRxNHbKykpwUMPPYTHH3+8xrGIiIgreu3LeXh4NHhOU68HEdWPMzJEbZhWq4XZbG7Uubt378a9996Lm266Cf3790dISAjOnDkj7wD/wtfXF8HBwdizZ4/0mNlsxr59++p93uDBg3H06FF07969xodWq5XOi4uLs3teXFwcevfuDQDo3bs30tPTkZ6eLh0/evQoCgoK0KdPHwDW2ZatW7de8fdJRI3HGRmiNqxz586Ij4/HmTNn4O3tjYCAgDrP7dGjB77//ntMnz4dCoUCL7zwgktmEh577DEsWbIE3bt3R69evfDBBx8gPz8fCoWizuc888wzGDlyJBYsWID7778fXl5eOHr0KLZs2YLly5dL5+3evRtvvPEGZs6ciS1btmDdunX4+eefAQATJ05E//79MWfOHLz77rswmUx49NFHMXbsWKkM969//QsTJkxAt27dcMcdd8BkMuGXX37BM888I+9FIWrDOCND1IY9+eSTUKlU6NOnD9q3b19vv8uyZcvg7++PUaNGYfr06Zg0aRIGDx7sxNFaPfPMM5g9ezbuueceREVFSf0uer2+zudERkYiJiYGJ06cwNVXX41BgwbhxRdfRFhYmN15f//737F3714MGjQIr776KpYtW4ZJkyYBsJaAfvjhB/j7++Oaa67BxIkT0bVrV3z77bfS88eNG4d169bhxx9/xMCBAzF+/HgkJCTIcyGICACgEFdafCYiciGLxYLevXvjtttuwyuvvNLs1+ncuTMWLVqERYsWOW5wRCQ7lpaIyK2cPXsWv/32G8aOHQuj0Yjly5cjNTUVd955p6uHRkQuwNISEbkVpVKJVatWYdiwYRg9ejQOHTqE33//XWrKJaK2haUlIiIicluckSEiIiK3xSBDREREbotBhoiIiNwWgwwRERG5LQYZIiIiclsMMkREROS2GGSIiIjIbTHIEBERkdv6f6672fOwAKpJAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 640x480 with 1 Axes>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Prepare the word frequency distribution used for negative sampling in\n",
    "# contrastive learning, then build the skip-gram model from it.\n",
    "vocab_size = len(dataset.token2id)  # number of entries in the token-to-id mapping\n",
    "embed_size = 128\n",
    "distribution = dataset.get_word_distribution()\n",
    "print(distribution)\n",
    "# NOTE(review): this rebinds the notebook-level name `model`, which the first\n",
    "# code cell assigned to the gensim KeyedVectors -- confirm no later cell still\n",
    "# expects that object.\n",
    "model = SkipGramNCE(vocab_size, embed_size, distribution)\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "from torch.optim import SGD, Adam\n",
    "\n",
    "# 定义类方法collate_batch批量处理数据，转化为PyTorch需要的张量类型\n",
    "class DataCollator:\n",
    "    \"\"\"Collates a batch of two-column samples into a dict of long tensors.\"\"\"\n",
    "\n",
    "    @classmethod\n",
    "    def collate_batch(cls, batch):\n",
    "        \"\"\"Split a batch column-wise into 'input_ids' and 'labels' tensors.\n",
    "\n",
    "        Args:\n",
    "            batch: sequence of samples that np.asarray turns into a 2-D array;\n",
    "                column 0 becomes 'input_ids' and column 1 becomes 'labels'.\n",
    "\n",
    "        Returns:\n",
    "            dict with keys 'input_ids' and 'labels', each a torch.long tensor\n",
    "            holding one entry per sample.\n",
    "        \"\"\"\n",
    "        pairs = np.asarray(batch)\n",
    "        columns = {'input_ids': pairs[:, 0], 'labels': pairs[:, 1]}\n",
    "        return {key: torch.tensor(column, dtype=torch.long)\n",
    "                for key, column in columns.items()}\n",
    "\n",
    "# Training hyper-parameters.\n",
    "epochs = 100\n",
    "batch_size = 128\n",
    "learning_rate = 1e-3\n",
    "# Per-epoch loss history, appended to by the training loop below.\n",
    "epoch_loss = []\n",
    "\n",
    "# Shuffled mini-batches; each one is converted to tensors by collate_batch.\n",
    "data_collator = DataCollator()\n",
    "dataloader = DataLoader(\n",
    "    data,\n",
    "    batch_size=batch_size,\n",
    "    shuffle=True,\n",
    "    collate_fn=data_collator.collate_batch,\n",
    ")\n",
    "optimizer = Adam(model.parameters(), lr=learning_rate)\n",
    "# Put the model in training mode and start from cleared gradients.\n",
    "model.train()\n",
    "model.zero_grad()\n",
    "\n",
    "# 需要提前安装tqdm\n",
    "from tqdm import trange\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "# Training loop: every step feeds one batch to the model, back-propagates the\n",
    "# returned loss and lets the optimizer update the parameters.\n",
    "with trange(epochs, desc='epoch', ncols=60) as pbar:\n",
    "    for epoch in pbar:\n",
    "        running_loss, num_steps = 0.0, 0\n",
    "        for batch in dataloader:\n",
    "            loss = model(**batch)\n",
    "            # Read the scalar once; it is used for both the progress bar and the mean.\n",
    "            loss_value = loss.item()\n",
    "            pbar.set_description(f'epoch-{epoch}, loss={loss_value:.4f}')\n",
    "            loss.backward()\n",
    "            optimizer.step()\n",
    "            model.zero_grad()\n",
    "            running_loss += loss_value\n",
    "            num_steps += 1\n",
    "        # Record the mean loss over all batches of the epoch rather than the loss\n",
    "        # of whichever batch happened to come last, which is a noisy sample.\n",
    "        # An epoch without batches contributes no point to the curve.\n",
    "        if num_steps:\n",
    "            epoch_loss.append(running_loss / num_steps)\n",
    "\n",
    "# Plot the per-epoch mean loss.\n",
    "epoch_loss = np.array(epoch_loss)\n",
    "plt.plot(range(len(epoch_loss)), epoch_loss)\n",
    "plt.xlabel('training epoch')\n",
    "plt.ylabel('loss')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9430e9a",
   "metadata": {},
   "source": [
    "TF-IDF加权\n",
    "\n",
    "定义词频率（term frequency）。注意到不同长度的文章词频率会有较大差距，不利于比较和运算，因此可以对词频率取对数。\n",
    "\n",
    "$$\\text{tf}_{t,d} = \\log (\\text{count}(t,d) + 1)$$\n",
    "\n",
    "其中$\\text{count}(t,d)$表示词$t$在文档$d$中出现的次数，为了避免对0取对数，把所有的计数加1。\n",
    "\n",
    "那么如何区分高频词与低频词呢？TF-IDF引入了另一个重要的评价指标——文档频率（document frequency），即一个词在语料库所包含的多少篇文档中出现。在所有文档里出现的词往往是虚词或是常见实词，而只在少量文档里出现的词往往是具有明确含义的实词并且具有很强的文档区分度。用$\\text{df}_t$来表示在多少篇文档中出现了词$t$。\n",
    "\n",
    "为了压低高频词和提升低频词的影响，TF-IDF使用文档频率的倒数，也就是逆向文档频率（inverse document frequency）来对词频率进行加权。这很好理解，一个词的文档频率越高，其倒数就越小，权重就越小。\n",
    "\n",
    "$$\\text{idf}_t = \\log \\frac{N}{\\text{df}_t}$$\n",
    "\n",
    "其中$N$表示文档总数。为了避免分母为0，通常会将分母改为$\\text{df}_t+1$。\n",
    "\n",
    "基于词频率和逆向文档频率，得到TF-IDF的最终值为：\n",
    "\n",
    "$$w_{t,d} = \\text{tf}_{t,d} \\times \\text{idf}_{t}$$\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f765e353",
   "metadata": {},
   "source": [
    "很多情况下会额外对文档的TF-IDF向量使用L2归一化，使得不同文档的TF-IDF向量具有相同的模长，便于相互比较。\n",
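    "下面给出了TF-IDF的代码实现。注意该实现采用了与scikit-learn相同的平滑方式：当`smooth_idf=True`时，逆向文档频率按$\\text{idf}_t = \\log \\frac{N+1}{\\text{df}_t+1} + 1$计算，即分子和分母同时加1，并在取对数后再加1，使得在所有文档中都出现的词的权重不会变为0。"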
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "9ce8e610",
   "metadata": {},
   "outputs": [],
   "source": [
    "class TFIDF:\n",
    "    \"\"\"TF-IDF weighting for documents given as sequences of integer token ids.\n",
    "\n",
    "    The inverse document frequency is idf_t = log(N / df_t) + 1; with\n",
    "    smooth_idf, N and every df_t are increased by one before the division.\n",
    "\n",
    "    Args:\n",
    "        vocab_size: width of the output rows; token ids index into it.\n",
    "        norm: 'l2' or 'l1' to scale each document row to unit norm, or a\n",
    "            falsy value (e.g. None) to leave rows unnormalized.\n",
    "        smooth_idf: add one to the document count and to every document\n",
    "            frequency.\n",
    "        sublinear_tf: use log(count + 1) instead of the raw count.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if norm is truthy but neither 'l1' nor 'l2'.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, vocab_size, norm='l2', smooth_idf=True,\n",
    "                 sublinear_tf=True):\n",
    "        if norm and norm not in ('l1', 'l2'):\n",
    "            raise ValueError(f'unsupported norm: {norm!r}')\n",
    "        self.vocab_size = vocab_size\n",
    "        self.norm = norm\n",
    "        self.smooth_idf = smooth_idf\n",
    "        self.sublinear_tf = sublinear_tf\n",
    "\n",
    "    def fit(self, X):\n",
    "        \"\"\"Learn the idf vector from X, an iterable of token-id sequences.\n",
    "\n",
    "        Stores the result in self.idf and returns self so calls can be chained.\n",
    "        \"\"\"\n",
    "        doc_freq = np.zeros(self.vocab_size, dtype=np.float64)\n",
    "        for data in X:\n",
    "            # set() makes a token repeated inside one document count once.\n",
    "            for token_id in set(data):\n",
    "                doc_freq[token_id] += 1\n",
    "        doc_freq += int(self.smooth_idf)\n",
    "        n_samples = len(X) + int(self.smooth_idf)\n",
    "        # Without smoothing, ids that occur in no document have df == 0, which\n",
    "        # would give idf = inf and 0 * inf = NaN in transform(). Clamping df to 1\n",
    "        # avoids the division by zero and gives such ids the largest idf.\n",
    "        self.idf = np.log(n_samples / np.maximum(doc_freq, 1)) + 1\n",
    "        return self\n",
    "\n",
    "    def transform(self, X):\n",
    "        \"\"\"Return the (len(X), vocab_size) TF-IDF matrix for the documents in X.\n",
    "\n",
    "        fit() must have been called first. Rows are normalized according to\n",
    "        self.norm; all-zero rows (empty documents) are left as zeros.\n",
    "        \"\"\"\n",
    "        assert hasattr(self, 'idf'), 'fit() must be called before transform()'\n",
    "        term_freq = np.zeros((len(X), self.vocab_size), dtype=np.float64)\n",
    "        for i, data in enumerate(X):\n",
    "            for token in data:\n",
    "                term_freq[i, token] += 1\n",
    "        if self.sublinear_tf:\n",
    "            term_freq = np.log(term_freq + 1)\n",
    "        Y = term_freq * self.idf\n",
    "        if not self.norm:\n",
    "            return Y\n",
    "        if self.norm == 'l1':\n",
    "            row_norm = np.abs(Y).sum(axis=1)\n",
    "        else:\n",
    "            row_norm = np.sqrt((Y**2).sum(axis=1))\n",
    "        # Avoid dividing all-zero rows by zero.\n",
    "        row_norm[row_norm == 0] = 1\n",
    "        return Y / row_norm[:, None]\n",
    "\n",
    "    def fit_transform(self, X):\n",
    "        \"\"\"Fit the idf on X and return the TF-IDF matrix of the same X.\"\"\"\n",
    "        self.fit(X)\n",
    "        return self.transform(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9379c8b3-33b8-46af-a935-4f09eb35eb4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install gensim into the environment of the running kernel.\n",
    "%pip install gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2cc266c6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting nltk\n",
      "  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n",
      "Collecting click (from nltk)\n",
      "  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)\n",
      "Collecting joblib (from nltk)\n",
      "  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)\n",
      "Collecting regex>=2021.8.3 (from nltk)\n",
      "  Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl.metadata (41 kB)\n",
      "Requirement already satisfied: tqdm in d:\\promgfile\\anaconda\\envs\\pytorch\\lib\\site-packages (from nltk) (4.67.1)\n",
      "Requirement already satisfied: colorama in d:\\promgfile\\anaconda\\envs\\pytorch\\lib\\site-packages (from click->nltk) (0.4.6)\n",
      "Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)\n",
      "   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--\n",
      "   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--\n",
      "   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--\n",
      "   -------------------- ------------------- 0.8/1.5 MB 1.5 MB/s eta 0:00:01\n",
      "   -------------------- ------------------- 0.8/1.5 MB 1.5 MB/s eta 0:00:01\n",
      "   -------------------- ------------------- 0.8/1.5 MB 1.5 MB/s eta 0:00:01\n",
      "   --------------------------- ------------ 1.0/1.5 MB 883.6 kB/s eta 0:00:01\n",
      "   --------------------------- ------------ 1.0/1.5 MB 883.6 kB/s eta 0:00:01\n",
      "   --------------------------- ------------ 1.0/1.5 MB 883.6 kB/s eta 0:00:01\n",
      "   --------------------------- ------------ 1.0/1.5 MB 883.6 kB/s eta 0:00:01\n",
      "   ---------------------------------- ----- 1.3/1.5 MB 610.3 kB/s eta 0:00:01\n",
      "   ---------------------------------- ----- 1.3/1.5 MB 610.3 kB/s eta 0:00:01\n",
      "   ---------------------------------- ----- 1.3/1.5 MB 610.3 kB/s eta 0:00:01\n",
      "   ---------------------------------------- 1.5/1.5 MB 548.9 kB/s eta 0:00:00\n",
      "Downloading regex-2024.11.6-cp39-cp39-win_amd64.whl (274 kB)\n",
      "Downloading click-8.1.8-py3-none-any.whl (98 kB)\n",
      "Downloading joblib-1.5.0-py3-none-any.whl (307 kB)\n",
      "Installing collected packages: regex, joblib, click, nltk\n",
      "Successfully installed click-8.1.8 joblib-1.5.0 nltk-3.9.1 regex-2024.11.6\n",
      "Note: you may need to restart the kernel to use updated packages.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  WARNING: The script nltk.exe is installed in 'D:\\Promgfile\\Anaconda\\envs\\pytorch\\Scripts' which is not on PATH.\n",
      "  Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n"
     ]
    }
   ],
   "source": [
    "# Install nltk into the environment of the running kernel; pinned to the\n",
    "# version recorded in this cell's output so re-runs resolve the same release.\n",
    "%pip install nltk==3.9.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0f65f2f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch",
   "language": "python",
   "name": "env_name"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.21"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
